Our task is to analyze a set of medical data and create a model to classify whether a potential patient is at risk of stroke.
A stroke is a set of clinical symptoms associated with the sudden onset of focal or generalized brain dysfunction, resulting from a disruption of cerebral circulation and persisting for more than 24 hours.
import pandas as pd
import numpy as np
import joblib
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTENC
from imblearn.over_sampling import ADASYN
from imblearn.combine import SMOTETomek
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import EditedNearestNeighbours
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_validate
from sklearn.feature_selection import SelectFromModel
from yellowbrick import classifier
from yellowbrick.classifier import ClassificationReport
from yellowbrick.classifier.rocauc import roc_auc
import warnings
warnings.filterwarnings("ignore")
def plotting(data: pd.DataFrame, x: str, type: str, y=None, cl = 'stroke'):
    """
    Plot the given dataframe column(s) in the requested style using plotly
    (or seaborn/matplotlib for the heatmap).

    Args:
        data - dataset to plot
        x - column for the x-axis
        type - type of plot
            values: {'hist', 'box', 'hist_box', 'class_hist', 'class_box',
                     'class_hist_box', 'scatter', 'heatmap', 'lineplot'}
        y - column for the y-axis (used by 'scatter' and 'lineplot' only)
        cl - binary class column used to split the data (default 'stroke')
    Returns:
        0 (the plots are displayed as a side effect)
    Raises:
        NotImplementedError - for an unknown plot type
    """
    def plot_class_hist():
        """Overlayed histograms of column x, one trace per class of cl."""
        fig = go.Figure()
        fig.add_trace(go.Histogram(x = data.loc[data[cl]==0, x], name='No '+cl))
        fig.add_trace(go.Histogram(x = data.loc[data[cl]==1, x], name=cl))
        fig.update_layout(barmode='overlay')
        fig.update_traces(opacity=0.75)
        return fig

    def plot_class_box():
        """Side-by-side boxplots of column x, one trace per class of cl."""
        fig = go.Figure()
        fig.add_trace(go.Box(y = data.loc[data[cl]==0, x], name='No '+cl))
        fig.add_trace(go.Box(y = data.loc[data[cl]==1, x], name=cl))
        fig.update_layout(barmode='overlay')
        fig.update_traces(opacity=0.75)
        return fig

    if type=='hist':
        fig = px.histogram(data, x=x)
        fig.show()
    elif type=='box':
        fig = px.box(data, x=x)
        fig.show()
    elif type=='scatter':
        # Plot a copy: the previous version added a 'temp' column to the
        # caller's dataframe as a permanent side effect.
        plot_data = data.copy()
        plot_data['temp'] = plot_data[cl].astype('str')
        fig = px.scatter(plot_data, x=x, y=y, color='temp')
        fig.update_layout(title_text="Scatterplot for " + x)
        fig.update_traces(marker_size=10)
        fig.show()
    elif type=='heatmap':
        # NOTE(review): data.corr() on a frame with object columns relies on
        # older pandas dropping them implicitly - confirm pandas version.
        corr = data.corr()
        f, ax = plt.subplots(figsize=(16, 12))
        # Mask the upper triangle so each pair is shown once
        mask = np.triu(np.ones_like(corr, dtype=bool))
        cmap = sns.diverging_palette(230, 20, as_cmap=True)
        sns.heatmap(corr, annot=True, mask = mask, cmap=cmap)
    elif type=='hist_box':
        fig = make_subplots(rows=1, cols=2)
        fig.add_trace(go.Histogram(x = data[x], name=x), row=1,col=1)
        fig.add_trace(go.Box(y = data[x], name=x), row=1, col=2)
        fig.update_layout(barmode="overlay")
        fig.update_traces(opacity=0.7, row=1, col=1)
        fig.update_layout(title_text="Histogram and Boxplot for " + x)
        fig.show()
    elif type=='class_hist':
        fig = plot_class_hist()
        fig.show()
    elif type=='class_box':
        fig = plot_class_box()
        fig.show()
    elif type=='class_hist_box':
        fig = make_subplots(rows=1, cols=2)
        fig.add_trace(go.Histogram(x = data.loc[data[cl]==0, x], name='No '+cl), row=1,col=1)
        fig.add_trace(go.Histogram(x = data.loc[data[cl]==1, x], name=cl), row=1, col=1)
        fig.add_trace(go.Box(y = data.loc[data[cl]==0, x], name='No '+cl), row=1, col=2)
        fig.add_trace(go.Box(y = data.loc[data[cl]==1, x], name=cl), row=1, col=2)
        fig.update_layout(barmode="overlay")
        fig.update_traces(opacity=0.7, row=1, col=1)
        fig.update_layout(title_text="Histogram and Boxplot with classes for " + x)
        fig.show()
    elif type=='lineplot':
        fig = px.line(data, x=x, y=y, color=cl)
        fig.show()
    else:
        raise NotImplementedError('Not implemented!')
    return 0
def iqr_outliers(df: pd.DataFrame, col: str):
    """
    Replace outliers in the given column using the IQR method.

    In descriptive statistics, the interquartile range (IQR) is a measure of
    statistical dispersion: the difference between the 75th and 25th
    percentiles. Values at or below Q1 - 1.5*IQR are replaced with the 10th
    percentile, values at or above Q3 + 1.5*IQR with the 90th percentile.
    The 90th percentile is computed after the low replacement, matching the
    original statement order.

    Args:
        df - dataset to clean (the caller's dataframe is left unmodified)
        col - column whose outliers are replaced
    Returns:
        A copy of the dataframe with outliers in `col` replaced
    """
    # Work on a copy so the caller's dataframe is not mutated in place;
    # every call site assigns the return value, so this is safe.
    df = df.copy()
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    df.loc[df[col] <= Q1 - 1.5*IQR, col] = df[col].quantile(0.1)
    df.loc[df[col] >= Q3 + 1.5*IQR, col] = df[col].quantile(0.9)
    return df
def balance_data(X, y, algorithm, cat_features, strategy=1, seed = 123, k_neighbors = 5, n_neighbors=3):
    """
    Balance the dataset using the specified algorithm and sampling strategy.

    SMOTENC is always used as the oversampler so that the categorical
    features are synthesized correctly.

    Args:
        X - matrix of features (pandas DataFrame)
        y - vector of target variable observations
        algorithm - algorithm for balancing the dataset
            values: {'SMOTE', 'SMOTETOMEK', 'SMOTEENN'}
        cat_features - names of the categorical columns in X
        strategy - sampling information to resample the data set
        seed - random seed for reproducibility
        k_neighbors - number of neighbours for the SMOTE(NC) oversampler
        n_neighbors - number of neighbours for the ENN undersampler
    Returns:
        df_bs - balanced dataframe including the 'stroke' target column
        X_bs - balanced feature matrix
        y_bs - balanced target vector
    Raises:
        NotImplementedError - for an unknown algorithm name
    """
    # SMOTENC needs positional indices of the categorical columns
    cat_idx = [X.columns.get_loc(feat) for feat in cat_features]
    sm = SMOTENC(random_state=seed, categorical_features=cat_idx, sampling_strategy = strategy, k_neighbors=k_neighbors)
    if algorithm == 'SMOTE':
        bs = sm
    elif algorithm == 'SMOTETOMEK':
        bs = SMOTETomek(random_state = seed, sampling_strategy = strategy, smote= sm)
    elif algorithm == 'SMOTEENN':
        enn = EditedNearestNeighbours(n_neighbors = n_neighbors)
        bs = SMOTEENN(sampling_strategy = strategy, smote=sm, enn = enn)
    else:
        raise NotImplementedError('Not implemented!')
    X_bs, y_bs = bs.fit_resample(X, y)
    df_bs = X_bs.copy()
    df_bs['stroke'] = y_bs
    # Drop a leftover 'index' column (from an upstream reset_index) if present;
    # errors='ignore' prevents a KeyError when the column does not exist.
    X_bs = X_bs.drop(columns='index', axis=1, errors='ignore')
    print("Liczba wszystkich obserwacji: ", len(y_bs))
    print("\nLiczba obserwacji z klasy mniejszościowej: ", len(y_bs[y_bs==1]))
    print("\nProcent klasy mniejszościowej do całości po zbalansowaniu: ", len(y_bs[y_bs == 1])/len(y_bs))
    return df_bs, X_bs, y_bs
#Create class with threshold p selection
class LogisticRegressionWithThreshold(LogisticRegression):
    """LogisticRegression whose predict() accepts a custom probability threshold."""

    def predict(self, X, threshold=None):
        """
        Predict class labels; with `threshold` set, class 1 is assigned
        whenever P(class 1) >= threshold instead of the default 0.5.
        """
        if threshold is None:
            # No threshold passed in: defer to the base class (effectively 0.5)
            return LogisticRegression.predict(self, X)
        y_scores = LogisticRegression.predict_proba(self, X)[:, 1]
        return (y_scores >= threshold).astype(int)
class SupportVectorMachineWithThreshold(SVC):
    """SVC whose predict() accepts a custom probability threshold."""

    def predict(self, X, threshold=None):
        """
        Predict class labels; with `threshold` set, class 1 is assigned
        whenever P(class 1) >= threshold instead of the default 0.5.

        NOTE(review): the thresholded path uses predict_proba, which for SVC
        requires the estimator to be constructed with probability=True -
        confirm at the call sites.
        """
        if threshold is None:
            # No threshold passed in: defer to the base class (effectively 0.5)
            return SVC.predict(self, X)
        y_scores = SVC.predict_proba(self, X)[:, 1]
        return (y_scores >= threshold).astype(int)
class RandomForestClassifierWithThreshold(RandomForestClassifier):
    """RandomForestClassifier whose predict() accepts a custom probability threshold."""

    def predict(self, X, threshold=None):
        """
        Predict class labels; with `threshold` set, class 1 is assigned
        whenever P(class 1) >= threshold instead of the default 0.5.
        """
        if threshold is None:
            # No threshold passed in: defer to the base class (effectively 0.5)
            return RandomForestClassifier.predict(self, X)
        y_scores = RandomForestClassifier.predict_proba(self, X)[:, 1]
        return (y_scores >= threshold).astype(int)
def preprocess_data(X_train, X_test, columns_to_standardize, cat_features, scaler=True, ohe=True, test_only=False, sc=None):
    """
    Preprocess data by standardizing numerical features and one-hot encoding
    categorical features.

    Args:
        X_train - train dataset of features
        X_test - test dataset of features
        columns_to_standardize - columns to standardize
        cat_features - columns for OHE
        scaler - if True, standardize `columns_to_standardize`
        ohe - if True, one-hot encode `cat_features`
        test_only - if True, transform only the test set with the fitted `sc`
        sc - already-fitted scaler (required when test_only=True and scaler=True)
    Returns:
        test_only=True:  X_t - preprocessed test data
        otherwise:       X, X_t, sc - preprocessed train data, preprocessed
                         test data, and the fitted scaler
    """
    # BUG FIX: the helper was previously named `ohe`, which rebound the local
    # name and shadowed the boolean `ohe` parameter - so `if ohe == True`
    # compared a function to True and one-hot encoding was silently skipped.
    def _one_hot_encode(df: pd.DataFrame, cat_features):
        """Replace `cat_features` in df with their dummy-encoded columns."""
        df_ohe = df[cat_features].astype('str')
        dummies = pd.get_dummies(df_ohe)
        df = df.drop(columns=cat_features, axis=1)
        df = pd.merge(df, dummies, left_index=True, right_index=True)
        return df
    if test_only==True:
        X_t = X_test.copy()
        if scaler == True:
            X_t[columns_to_standardize] = sc.transform(X_t[columns_to_standardize])
        if ohe == True:
            X_t = _one_hot_encode(X_t, cat_features)
        return X_t
    else:
        X = X_train.copy()
        X_t = X_test.copy()
        sc = preprocessing.StandardScaler()
        if scaler == True:
            # Fit on train only, then apply the same transform to test
            X[columns_to_standardize] = sc.fit_transform(X_train[columns_to_standardize])
            X_t[columns_to_standardize] = sc.transform(X_t[columns_to_standardize])
        if ohe == True:
            X = _one_hot_encode(X, cat_features)
            X_t = _one_hot_encode(X_t, cat_features)
        return X, X_t, sc
def fit_pred_score(X_train, y_train, X_test, y_test, model, model_name, dataset_name, scaler=True, ohe = False, columns_to_standardize = None, cat_features = None, visualize_train = True, visualize_test = True, show_coefficients = False, threshold = None):
    """
    Train and evaluate the given classifier.

    Preprocesses the data, fits the model, predicts on both splits (optionally
    with a custom decision threshold), optionally plots classification reports
    and coefficients, and returns the metrics as a dataframe.

    Args:
        X_train, y_train, X_test, y_test - training and testing data with associated labels
        model - model to train and test
        model_name - name of the model for the report
        dataset_name - dataset name used for the report
        scaler - if features should be scaled
        ohe - if categorical features should be one-hot encoded
        columns_to_standardize - columns which should be scaled if scaler = True
        cat_features - categorical columns for OHE
        visualize_train - if classification report for training data should be visualized
        visualize_test - if classification report for testing data should be visualized
        show_coefficients - if coefficients of features should be displayed
            (assumes a linear model exposing `coef_`)
        threshold - decision threshold 'p'; only supported by the
            *WithThreshold model subclasses defined in this file
    Returns:
        report - dataframe with metrics from the classification report for
            both splits, tagged with dataset and model name
    """
    X, X_t, standard_scaler = preprocess_data(X_train, X_test, columns_to_standardize = columns_to_standardize, cat_features = cat_features, scaler = scaler, ohe = ohe)
    model.fit(X, y_train)
    if threshold is None:
        y_pred_train = model.predict(X)
        y_pred = model.predict(X_t)
    else:
        y_pred_train = model.predict(X, threshold = threshold)
        y_pred = model.predict(X_t, threshold = threshold)
    classes=['No stroke', 'Stroke']
    if visualize_train == True:
        # Classification report heatmap on the training split (yellowbrick)
        visualizer = ClassificationReport(
            model, classes=classes)
        visualizer.fit(X, y_train)
        visualizer.score(X, y_train)
        visualizer.show()
    if visualize_test == True:
        # Classification report for the test split (yellowbrick quick method)
        classifier.classification_report(model, X, y_train, X_t, y_test, classes=classes)
    if show_coefficients == True:
        # Print one "<feature> = <coefficient>" line per predictor
        # (the previous unused coef_dict accumulator was removed)
        for coef, feat in zip(model.coef_[0,:],X.columns):
            print(feat, " = ", coef)
    # Collect sklearn's classification_report for both splits into one frame
    report = pd.DataFrame()
    for t, y, y_approx in zip(['train','test'], [y_train, y_test], [y_pred_train, y_pred]):
        results = pd.DataFrame(metrics.classification_report(y, y_approx, target_names = classes, output_dict=True))
        results['metric'] = results.index
        results.reset_index(inplace=True, drop=True)
        results['dataset_name'] = dataset_name
        results['model_name'] = model_name
        results['split'] = t
        report = pd.concat([report, results])
    return report
def transform_report_to_plot(dt : pd.DataFrame, x : str, cl = 'metric', train_test = False):
    """
    Transform a report dataframe for plotting: the F1-score row uses the macro
    average, while precision and recall are taken per class ('Stroke' and
    'No stroke').

    Args:
        dt - filtered report dataset
        x - column used as the plot axis
        cl - column that will hold the displayed metric label
        train_test - if True, read the 'train_test_diff*' columns produced by
            prepare_diff_dataset instead of the raw metric columns
    Returns:
        df - long-format dataframe with columns ['value', cl, '']
    """
    prefix = 'train_test_diff' if train_test == True else ''
    # BUG FIX: the two precision selections previously filtered on the global
    # `report` dataframe instead of the `dt` argument. Slices are copied to
    # avoid SettingWithCopy issues when the labels are overwritten below.
    f1_s = dt.loc[(dt.metric == 'f1-score'), [prefix+'macro avg', cl, x]].copy()
    precision_s = dt.loc[(dt.metric == 'precision'), [prefix+'Stroke', cl, x]].copy()
    precision_s[cl] = "Precision (Stroke)"
    recall_s = dt.loc[(dt.metric == 'recall'), [prefix+'Stroke', cl, x]].copy()
    recall_s[cl] = "Recall (Stroke)"
    precision_ns = dt.loc[(dt.metric == 'precision'), [prefix+'No stroke', cl, x]].copy()
    precision_ns[cl] = "Precision (No stroke)"
    recall_ns = dt.loc[(dt.metric == 'recall'), [prefix+'No stroke', cl, x]].copy()
    recall_ns[cl] = "Recall (No stroke)"
    # Give every piece the same column names so they stack cleanly
    columns = ['value', cl, '']
    for d in [f1_s, precision_s, recall_s, precision_ns, recall_ns]:
        d.columns = columns
    df = pd.concat([f1_s, precision_s, recall_s, precision_ns, recall_ns])
    return df
def prepare_diff_dataset(df : pd.DataFrame, by: str, cols_for_diff = ['No stroke', 'Stroke', 'accuracy', 'macro avg', 'weighted avg']):
    """
    Take the report dataset and return it with, for every column in
    `cols_for_diff`, the absolute train/test difference per (`by`, metric)
    group.

    Assumes each (`by`, metric) group holds exactly two rows (train then
    test), so .diff() yields NaN on the first row and the difference on the
    second.

    Args:
        df - report dataset
        by - grouping column name
        cols_for_diff - columns for which the difference is computed
    Returns:
        df_diff - copy of df with added 'train_test_diff<col>' columns
    """
    # Note: the annotation above was `pd.DataFrame()` (an instance), fixed to
    # the class; `filter` was renamed `mask` to stop shadowing the builtin.
    df_diff = df.copy()
    for col in cols_for_diff:
        df_diff['train_test_diff'+col] = None
        for ds in df_diff[by].unique():
            for m in df_diff.metric.unique():
                mask = (df_diff[by] == ds) & (df_diff.metric == m)
                df_diff.loc[mask, 'train_test_diff'+col] = np.abs(df_diff.loc[mask, col].diff())
    return df_diff
# Load the raw stroke dataset directly from GitHub
url = 'https://github.com/maju116/Statistics_II_GUT/raw/main/PROJECT/healthcare-dataset-stroke-data.csv'
df = pd.read_csv(url)
# Quick look at the first rows
df.head(5)
| id | gender | age | hypertension | heart_disease | ever_married | work_type | Residence_type | avg_glucose_level | bmi | smoking_status | stroke | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 9046 | Male | 67.0 | 0 | 1 | Yes | Private | Urban | 228.69 | 36.6 | formerly smoked | 1 |
| 1 | 51676 | Female | 61.0 | 0 | 0 | Yes | Self-employed | Rural | 202.21 | NaN | never smoked | 1 |
| 2 | 31112 | Male | 80.0 | 0 | 1 | Yes | Private | Rural | 105.92 | 32.5 | never smoked | 1 |
| 3 | 60182 | Female | 49.0 | 0 | 0 | Yes | Private | Urban | 171.23 | 34.4 | smokes | 1 |
| 4 | 1665 | Female | 79.0 | 1 | 0 | Yes | Self-employed | Rural | 174.12 | 24.0 | never smoked | 1 |
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5110 entries, 0 to 5109 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 5110 non-null int64 1 gender 5110 non-null object 2 age 5110 non-null float64 3 hypertension 5110 non-null int64 4 heart_disease 5110 non-null int64 5 ever_married 5110 non-null object 6 work_type 5110 non-null object 7 Residence_type 5110 non-null object 8 avg_glucose_level 5110 non-null float64 9 bmi 4909 non-null float64 10 smoking_status 5110 non-null object 11 stroke 5110 non-null int64 dtypes: float64(3), int64(4), object(5) memory usage: 479.2+ KB
df.describe().transpose()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| id | 5110.0 | 36517.829354 | 21161.721625 | 67.00 | 17741.250 | 36932.000 | 54682.00 | 72940.00 |
| age | 5110.0 | 43.226614 | 22.612647 | 0.08 | 25.000 | 45.000 | 61.00 | 82.00 |
| hypertension | 5110.0 | 0.097456 | 0.296607 | 0.00 | 0.000 | 0.000 | 0.00 | 1.00 |
| heart_disease | 5110.0 | 0.054012 | 0.226063 | 0.00 | 0.000 | 0.000 | 0.00 | 1.00 |
| avg_glucose_level | 5110.0 | 106.147677 | 45.283560 | 55.12 | 77.245 | 91.885 | 114.09 | 271.74 |
| bmi | 4909.0 | 28.893237 | 7.854067 | 10.30 | 23.500 | 28.100 | 33.10 | 97.60 |
| stroke | 5110.0 | 0.048728 | 0.215320 | 0.00 | 0.000 | 0.000 | 0.00 | 1.00 |
We have a total of 12 columns of 5110 observations of which one is ('stroke') the target variable and we will try to model it. It is a binary variable indicating whether a particular patient had a stroke or not so it is a typical classification task. In addition to this, there are the other 11 variables:
We can see that we have 211 missing data for the variable bmi, which we will deal with later in the project.
df.stroke.value_counts()
0 4861 1 249 Name: stroke, dtype: int64
plotting(df, 'stroke', type = 'hist')
0
We can see that our target variable is unbalanced where the majority class is non-stroke patients. We need to keep this in mind, and if left as is, we can't use the accuracy metric, and should use another one - for example, the harmonic mean between precision and recall, or f1-score. However, an unbalanced dataset is quite problematic, as the classifier tends to focus on the prediction of the majority class.
There are ways to balance a dataset that are part of pre-processing that is, pre-processing the data before modeling. Some of the more popular methods include:
With undersampling, we can get rid of informationally relevant observations. Additionally, our dataset is small, so we wouldn't want to be limited to just 249 observations. Oversampling, on the other hand, can lead to over-fitting the model to our data. There is also a mixed approach using both undersampling and oversampling, but it is difficult to determine the optimal ratio between the approaches. We will try to empirically test the best method and restrict ourselves to it, of course, based not on simple sampling, but on available and known algorithms for balancing datasets.
print("Number of duplicate patient ID values: ", len(df[df.id.duplicated()]))
Number of duplicate patient ID values: 0
In our dataset, each patient is entered uniquely, so the ID variable is unlikely to be useful to us for anything else, and we will be able to leave it for later stages.
plotting(data = df, x='gender', type='class_hist')
0
df.gender.value_counts()
Female 2994 Male 2115 Other 1 Name: gender, dtype: int64
There are 3 values in the gender variable (male, female, other). The value 'Other' occurs only once, so it can be treated as missing data. Depending on the values of the other variable for this observation, we will either replace it with male or female (as the one that occurs most often) or remove it.
plotting(data = df, x='age', type='hist_box')
0
plotting(data = df, x='age', type='class_hist_box')
0
We can observe that the stroke patients in our collection are mainly elderly (60-80 years old). In contrast, people without stroke are fairly evenly distributed, with a median age of 43 years, where it is 71 years for people with stroke. In addition, there are outlier variables in the group of people with stroke. The distribution of the variable itself is close to a uniform distribution with a "coarse" estimate.
plotting(data = df, x='hypertension', type='class_hist')
0
It can be seen that the vast majority of patients do not have hypertension. Moreover, despite the lack of a balanced dataset, more stroke patients did not have hypertension at the same time.
plotting(data = df, x='heart_disease', type='class_hist')
0
Similar situation as for the variable hypertension. The vast majority of patients do not have heart disease and, despite the imbalance, most people with stroke also do not have heart disease.
plotting(data = df, x='ever_married', type='class_hist')
0
The ever_married variable is a categorical variable that will need to be coded at a value of (0,1). A larger proportion of patients were ever married and it is in this group that stroke occurs in greater numbers.
plotting(data = df, x='work_type', type='class_hist')
0
The work_type variable, which determines the type of work performed, is also a categorical variable, which we will code into numeric values. The largest number of patients is in the 'private' category, followed by an equally quantitative distribution between 'children', 'self_employed', 'govt_job'. A negligible number have never worked. The largest number of strokes is among the private and self-employed sectors. A small portion in the public sector. Single values are also found in the children's group, but from previous findings, these are most likely outliers. One could also try to categorize this variable and separate the burdened and unburdened sectors in terms of stroke risk.
plotting(data = df, x='Residence_type', type='class_hist')
0
The variable specifies the area of residence with a rural and urban split. It can be seen that the split is even and there is a similar proportion of post-stroke to non-stroke people in each division, so this variable probably carries little information for a potential model.
plotting(data = df, x='avg_glucose_level', type='hist_box')
0
plotting(data = df, x='avg_glucose_level', type='class_hist_box')
0
We can observe that there are the largest number of patients with average glucose levels in the range of 77.24 (1st quartile) to 114.09 (3rd quartile), that is, with normal or pre-diabetic levels. This conclusion, however, is generalizable, as the range varies somewhat depending on the time or method of testing, as well as the age of the person being tested. Those with higher mean blood glucose levels are outliers for the non-stroke group, while for the rest of the subjects they fall into the 4th quartile. So removing these observations, or changing their values, would result in the loss of perhaps significant information. Instead, we will try to take advantage of this relationship by creating a new categorical variable reporting the average glucose range (normal, pre-diabetic, diabetic) and imputing new values, as well as transforming the entire variable.
plotting(data = df, x='bmi', type='hist_box')
0
plotting(data = df, x='bmi', type='class_hist_box')
0
The distribution of the variable bmi resembles a normal distribution with a heavier right tail. There are also outliers for bmi > 47.5. Most patients are between 23.5 and 33.1, meaning they are normal weight or overweight to the first degree. While the variable itself may not carry much information due to the fact that the distribution of values is similar for people with and without stroke, it is nevertheless possible that there are some significant interactions between the other variables such as mean blood glucose levels. In the context of outlier data, it is also possible to create a categorical variable according to the BMI categories.
In addition, the BMI variable contains data gaps that need to be addressed; there are a number of methods for doing this, and we use one of the most common below.
Since data gaps are few (only 4%), filling them with descriptive statistics as a simple but effective measure. As mentioned, the distribution of the variable is close to normal so it does not matter much to choose between the median and the mean, but given the slight right tail the median will be chosen. For outlier observations we will use the IQR method.
plotting(data = df, x='smoking_status', type='class_hist_box')
0
It's hard to draw broader conclusions here given the fairly large 'Unknown' group. Assuming the 'Unknown' values would be fairly evenly distributed, the largest group of patients had never smoked cigarettes. You can see the distinction for stroke patients: they are mostly non-smokers or occasional smokers. Let's check the age of the people in the 'Unknown' group; perhaps we will naturally separate the underage group.
plotting(data=df[(df.age <= 16)], x='smoking_status', type='class_hist')
0
We can see that for minors the value of smoking_status is mostly equal to 'Unknown'. Something needs to be done with this group of people, however, you can't just remove these observations, as stroke can also affect children. On the one hand, it is possible to classify these people into the 'Never smoked' group, but it is highly likely that in reality some people may 'smoke'. So we will create a new 'Underage' category for these patients.
In feature engineering, we will create new variables from existing ones, deal with missing data or outlier observations. For the latter, we will use the IQR interquartile range method. This is the difference between the third and first quartiles. Since between these quartiles is, by definition, 50% of all observations (located centrally in the distribution), so the greater the width of the interquartile range, the greater the variation in the trait. In addition, at the end we will perform balancing of our set using 3 techniques: SMOTE, SMOTE+Tomek and SMOTE+ENN.
# Deep copy so feature engineering never touches the raw dataframe
df_clear = df.copy()
# Impute the BMI gaps with the median of the observed values
df_clear['bmi_no_nan'] = df_clear.bmi.fillna(df_clear.bmi.median())
# Numeric BMI with outliers clipped via the IQR method
df_clear['bmi_no_outliers'] = df_clear.bmi_no_nan
df_clear = iqr_outliers(df_clear, 'bmi_no_outliers')
# Log-transformed BMI, again with IQR outlier clipping
df_clear['log_bmi_no_outliers'] = np.log(df_clear.bmi_no_nan)
df_clear = iqr_outliers(df_clear, 'log_bmi_no_outliers')
# Categorical BMI variable following the standard BMI bands:
# (lower bound inclusive, upper bound exclusive, label)
bmi_bands = [
    (None, 18.5, 'underweight'),
    (18.5, 24.9, 'normal'),
    (24.9, 29.9, 'overweight'),
    (29.9, 34.9, 'obese'),
    (34.9, None, 'extremely obese'),
]
for lower, upper, label in bmi_bands:
    band = pd.Series(True, index=df_clear.index)
    if lower is not None:
        band &= df_clear.bmi_no_outliers >= lower
    if upper is not None:
        band &= df_clear.bmi_no_outliers < upper
    df_clear.loc[band, 'bmi_categorical'] = label
# Encode each BMI band with the truncated median of its members
for label in ['underweight', 'normal', 'overweight', 'obese', 'extremely obese']:
    members = df_clear.bmi_categorical == label
    df_clear.loc[members, 'bmi_categorical_encoded'] = int(df_clear.loc[members, 'bmi_no_outliers'].median())
df_clear.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5110 entries, 0 to 5109 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 5110 non-null int64 1 gender 5110 non-null object 2 age 5110 non-null float64 3 hypertension 5110 non-null int64 4 heart_disease 5110 non-null int64 5 ever_married 5110 non-null object 6 work_type 5110 non-null object 7 Residence_type 5110 non-null object 8 avg_glucose_level 5110 non-null float64 9 bmi 4909 non-null float64 10 smoking_status 5110 non-null object 11 stroke 5110 non-null int64 12 bmi_no_nan 5110 non-null float64 13 bmi_no_outliers 5110 non-null float64 14 log_bmi_no_outliers 5110 non-null float64 15 bmi_categorical 5110 non-null object 16 bmi_categorical_encoded 5110 non-null float64 dtypes: float64(7), int64(4), object(6) memory usage: 678.8+ KB
# Visualize each engineered BMI variable against the stroke classes
bmi_plots = [
    ('bmi_no_nan', 'class_hist_box'),
    ('bmi_categorical', 'class_hist'),
    ('bmi_no_outliers', 'class_hist_box'),
    ('log_bmi_no_outliers', 'class_hist_box'),
]
for column, plot_kind in bmi_plots:
    plotting(data=df_clear, x=column, type=plot_kind)
We can see that filling in data gaps resulted in a peak at the median point which is a natural effect of the method used.
In addition, the overweight group is the most numerous, and this is also where the most stroke patients are found. In the case of removing outliers, we can observe getting rid of the right-hand tail, while the use of log-transformation seems to have no significant effect other than rescaling, which will be unnecessary since we will be standardizing the variables at a later stage, so we will discard this variable.
Moreover, we can see that the assignment of quantile limits did not eliminate outlier data in the stroke group. In this situation, we can remove observations specifically for this group, but these are not isolated cases and the group is not very large anyway. In this case, due to the further slight cluttering of the variable by these observations, we will stay with the categorical variable binning to the appropriate group. This is a fairly common solution for BMI, as it is a common body weight index.
# Drop the single 'Other' gender record (treated as missing data),
# then binary-encode gender into is_female (Female=1, Male=0)
df_clear = df_clear[df_clear.gender != 'Other'].copy()
df_clear['is_female'] = df_clear.gender.map({'Female': 1, 'Male': 0}).astype('int')
df_clear.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 5109 entries, 0 to 5109 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 5109 non-null int64 1 gender 5109 non-null object 2 age 5109 non-null float64 3 hypertension 5109 non-null int64 4 heart_disease 5109 non-null int64 5 ever_married 5109 non-null object 6 work_type 5109 non-null object 7 Residence_type 5109 non-null object 8 avg_glucose_level 5109 non-null float64 9 bmi 4908 non-null float64 10 smoking_status 5109 non-null object 11 stroke 5109 non-null int64 12 bmi_no_nan 5109 non-null float64 13 bmi_no_outliers 5109 non-null float64 14 log_bmi_no_outliers 5109 non-null float64 15 bmi_categorical 5109 non-null object 16 bmi_categorical_encoded 5109 non-null float64 17 is_female 5109 non-null int32 dtypes: float64(7), int32(1), int64(4), object(6) memory usage: 738.4+ KB
# Subset containing only the stroke patients
df_stroke = df_clear[df_clear.stroke == 1].copy()
# Clip age outliers within the stroke group using the IQR method
df_stroke = iqr_outliers(df=df_stroke, col='age')
# Re-assemble the full dataset (row order changes: non-stroke rows first)
df_clear = pd.concat([df_clear[df_clear.stroke==0], df_stroke])
# Plot the cleaned age distribution per class
plotting(data=df_clear, x='age', type='class_hist_box')
# Summary of the resulting dataframe
df_clear.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 5109 entries, 249 to 248 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 5109 non-null int64 1 gender 5109 non-null object 2 age 5109 non-null float64 3 hypertension 5109 non-null int64 4 heart_disease 5109 non-null int64 5 ever_married 5109 non-null object 6 work_type 5109 non-null object 7 Residence_type 5109 non-null object 8 avg_glucose_level 5109 non-null float64 9 bmi 4908 non-null float64 10 smoking_status 5109 non-null object 11 stroke 5109 non-null int64 12 bmi_no_nan 5109 non-null float64 13 bmi_no_outliers 5109 non-null float64 14 log_bmi_no_outliers 5109 non-null float64 15 bmi_categorical 5109 non-null object 16 bmi_categorical_encoded 5109 non-null float64 17 is_female 5109 non-null int32 dtypes: float64(7), int32(1), int64(4), object(6) memory usage: 738.4+ KB
# Binary flag for marital status: ever_married 'Yes' -> 1, 'No' -> 0.
# Any value outside {'Yes', 'No'} would map to NaN and make astype raise,
# mirroring the original .loc-based assignment's failure mode.
df_clear['is_married'] = df_clear.ever_married.map({'Yes': 1, 'No': 0}).astype('int')
df_clear.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 5109 entries, 249 to 248 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 5109 non-null int64 1 gender 5109 non-null object 2 age 5109 non-null float64 3 hypertension 5109 non-null int64 4 heart_disease 5109 non-null int64 5 ever_married 5109 non-null object 6 work_type 5109 non-null object 7 Residence_type 5109 non-null object 8 avg_glucose_level 5109 non-null float64 9 bmi 4908 non-null float64 10 smoking_status 5109 non-null object 11 stroke 5109 non-null int64 12 bmi_no_nan 5109 non-null float64 13 bmi_no_outliers 5109 non-null float64 14 log_bmi_no_outliers 5109 non-null float64 15 bmi_categorical 5109 non-null object 16 bmi_categorical_encoded 5109 non-null float64 17 is_female 5109 non-null int32 18 is_married 5109 non-null int32 dtypes: float64(7), int32(2), int64(4), object(6) memory usage: 758.4+ KB
# Integer-encode the work_type categories; fit_transform is equivalent to
# calling fit followed by transform on the same data.
le = preprocessing.LabelEncoder()
df_clear['work_type_encoded'] = le.fit_transform(df_clear.work_type)
df_clear.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 5109 entries, 249 to 248 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 5109 non-null int64 1 gender 5109 non-null object 2 age 5109 non-null float64 3 hypertension 5109 non-null int64 4 heart_disease 5109 non-null int64 5 ever_married 5109 non-null object 6 work_type 5109 non-null object 7 Residence_type 5109 non-null object 8 avg_glucose_level 5109 non-null float64 9 bmi 4908 non-null float64 10 smoking_status 5109 non-null object 11 stroke 5109 non-null int64 12 bmi_no_nan 5109 non-null float64 13 bmi_no_outliers 5109 non-null float64 14 log_bmi_no_outliers 5109 non-null float64 15 bmi_categorical 5109 non-null object 16 bmi_categorical_encoded 5109 non-null float64 17 is_female 5109 non-null int32 18 is_married 5109 non-null int32 19 work_type_encoded 5109 non-null int32 dtypes: float64(7), int32(3), int64(4), object(6) memory usage: 778.3+ KB
# Binary flag for residence type: 'Rural' -> 1, 'Urban' -> 0.
df_clear['is_rural'] = df_clear.Residence_type.map({'Rural': 1, 'Urban': 0}).astype('int')
df_clear.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 5109 entries, 249 to 248 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 5109 non-null int64 1 gender 5109 non-null object 2 age 5109 non-null float64 3 hypertension 5109 non-null int64 4 heart_disease 5109 non-null int64 5 ever_married 5109 non-null object 6 work_type 5109 non-null object 7 Residence_type 5109 non-null object 8 avg_glucose_level 5109 non-null float64 9 bmi 4908 non-null float64 10 smoking_status 5109 non-null object 11 stroke 5109 non-null int64 12 bmi_no_nan 5109 non-null float64 13 bmi_no_outliers 5109 non-null float64 14 log_bmi_no_outliers 5109 non-null float64 15 bmi_categorical 5109 non-null object 16 bmi_categorical_encoded 5109 non-null float64 17 is_female 5109 non-null int32 18 is_married 5109 non-null int32 19 work_type_encoded 5109 non-null int32 20 is_rural 5109 non-null int32 dtypes: float64(7), int32(4), int64(4), object(6) memory usage: 798.3+ KB
# New categorical variable from the average blood glucose level
# (<140 normal, 140-199 prediabetes, >199 diabetes).
# NOTE(review): units are presumably mg/dL -- confirm against the data source.
df_clear.loc[(df_clear.avg_glucose_level < 140), 'glucose_categorical'] = 'normal'
df_clear.loc[(df_clear.avg_glucose_level >= 140) & (df_clear.avg_glucose_level <= 199), 'glucose_categorical'] = 'prediabetes'
df_clear.loc[(df_clear.avg_glucose_level > 199), 'glucose_categorical'] = 'diabetes'
# Integer-encode the three glucose categories.
le_gl = preprocessing.LabelEncoder()
le_gl.fit(df_clear.glucose_categorical)
df_clear['glucose_categorical_encoded'] = le_gl.transform(df_clear.glucose_categorical)
df_clear.info()
# Numeric copy with outliers treated by the IQR rule (iqr_outliers helper).
df_clear['glucose_no_outliers'] = df_clear.avg_glucose_level
df_clear = iqr_outliers(df_clear, 'glucose_no_outliers')
# Log-transformed copy, also IQR-treated.
# NOTE(review): the log is taken from the raw avg_glucose_level rather than the
# already-treated glucose_no_outliers column -- confirm this is intentional.
df_clear['log_glucose_no_outliers'] = np.log(df_clear.avg_glucose_level)
df_clear = iqr_outliers(df_clear, 'log_glucose_no_outliers')
<class 'pandas.core.frame.DataFrame'> Int64Index: 5109 entries, 249 to 248 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 5109 non-null int64 1 gender 5109 non-null object 2 age 5109 non-null float64 3 hypertension 5109 non-null int64 4 heart_disease 5109 non-null int64 5 ever_married 5109 non-null object 6 work_type 5109 non-null object 7 Residence_type 5109 non-null object 8 avg_glucose_level 5109 non-null float64 9 bmi 4908 non-null float64 10 smoking_status 5109 non-null object 11 stroke 5109 non-null int64 12 bmi_no_nan 5109 non-null float64 13 bmi_no_outliers 5109 non-null float64 14 log_bmi_no_outliers 5109 non-null float64 15 bmi_categorical 5109 non-null object 16 bmi_categorical_encoded 5109 non-null float64 17 is_female 5109 non-null int32 18 is_married 5109 non-null int32 19 work_type_encoded 5109 non-null int32 20 is_rural 5109 non-null int32 21 glucose_categorical 5109 non-null object 22 glucose_categorical_encoded 5109 non-null int32 dtypes: float64(7), int32(5), int64(4), object(7) memory usage: 858.2+ KB
# Visualise each glucose variant against the stroke classes.
plot_specs = [
    ('glucose_categorical', 'class_hist'),
    ('glucose_no_outliers', 'class_hist_box'),
    ('log_glucose_no_outliers', 'class_hist_box'),
]
for col, plot_type in plot_specs:
    plotting(data=df_clear, x=col, type=plot_type)
The group of people with the normal category of average blood glucose level is the most numerous, and this is also where people with stroke are most numerous (degree I obesity was also included there).
In the context of the categorization of the variable, rather a drop in information was obtained, as the single most numerous group of people with stroke and others with similar numbers remained. Moreover, as mentioned, it's hard to categorize unequivocally because we don't know the characteristics of the sugar measurement, as well as additional diseases of patients such as diabetes.
We also see that the IQR method with imputation, instead of removing the data, further retained some outlier data. However, when using log-transformation, these are the few values that are strongly close to the upper value. With this in mind, as well as the fact that the information drops after categorization, we will limit ourselves to this variable.
df_clear['smoking_status_new'] = df_clear.smoking_status
df_clear.loc[df_clear.age <= 21, 'smoking_status_new'] = 'underage'
print("The 'Unknown' group represents %.2f percent of observations of the total set" % (len(df_clear[df_clear.smoking_status_new == 'underage'])/len(df_clear)))
plotting(data=df_clear, x='smoking_status_new', type='class_hist')
df_clear.info()
The 'Unknown' group represents 0.21 percent of observations of the total set
<class 'pandas.core.frame.DataFrame'> Int64Index: 5109 entries, 249 to 248 Data columns (total 26 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 5109 non-null int64 1 gender 5109 non-null object 2 age 5109 non-null float64 3 hypertension 5109 non-null int64 4 heart_disease 5109 non-null int64 5 ever_married 5109 non-null object 6 work_type 5109 non-null object 7 Residence_type 5109 non-null object 8 avg_glucose_level 5109 non-null float64 9 bmi 4908 non-null float64 10 smoking_status 5109 non-null object 11 stroke 5109 non-null int64 12 bmi_no_nan 5109 non-null float64 13 bmi_no_outliers 5109 non-null float64 14 log_bmi_no_outliers 5109 non-null float64 15 bmi_categorical 5109 non-null object 16 bmi_categorical_encoded 5109 non-null float64 17 is_female 5109 non-null int32 18 is_married 5109 non-null int32 19 work_type_encoded 5109 non-null int32 20 is_rural 5109 non-null int32 21 glucose_categorical 5109 non-null object 22 glucose_categorical_encoded 5109 non-null int32 23 glucose_no_outliers 5109 non-null float64 24 log_glucose_no_outliers 5109 non-null float64 25 smoking_status_new 5109 non-null object dtypes: float64(9), int32(5), int64(4), object(8) memory usage: 977.9+ KB
The Unknown group is still quite large, but we have brought out a more diverse distribution instead and will leave it that way. This way we can see that in almost every group we have an equal proportion of people with stroke as well as without stroke. This is not a very informative variable detailing information on stroke patients. We could isolate the 'underage' group here, and aggregate the others to get a binary variable, but it will really only be based on age, not actual smoking status. So we will discard this variable in further analysis.
# Correlation heatmap; label-encoded categorical columns (and the id) are
# excluded because correlations on arbitrary label codes are not meaningful.
encoded_cols = ['id', 'bmi_categorical_encoded', 'work_type_encoded', 'glucose_categorical_encoded']
plotting(data=df_clear.drop(columns=encoded_cols), x='none', type='heatmap')
0
Weak correlation is seen between variables and between variables and the target variable. The largest values are seen with the same variables, but after the transformation or coding which is an obvious result - only the is_married variable correlates with the age variable, but we will not, however, exclude any of them. So there is no problem of collinearity here. It is also hard to make a selection of variables on this basis, so we will rather use LASSO regularization at a later stage.
Very often the input variables are interactive with each other in a non-linear way. Such interactions can later be detected and used by the model at the variable selection stage. What's more, raising some variables to a certain power can help to extract important relationships between that variable and the target variable. Mostly, raising to the second (possibly third) degree is enough, as polynomials of the fourth and higher degree are excessively flexible, have irregular shapes and can lead to overfitting. We will use polynomial features of degree two.
# Degree-2 PolynomialFeatures transformer: all pairwise interactions plus
# squares of the a-priori chosen numeric columns.
poly = preprocessing.PolynomialFeatures(2)
# Keep only numeric columns, then drop ids, raw/duplicated variants and the
# redundant glucose encoding.
excluded_cols = ['id', 'bmi_no_nan', 'bmi', 'avg_glucose_level', 'glucose_no_outliers',
                 'log_bmi_no_outliers', 'bmi_no_outliers', 'glucose_categorical_encoded']
X_poly = (df_clear
          .select_dtypes(['int32', 'int64', 'float32', 'float64'])
          .drop(columns=excluded_cols))
print("Ramka wejściowa")
X_poly.info()
print("\nRamka wyjściowa po PolyFeatures transformacji")
# Rebuild a labelled frame from the transformed ndarray.
df_poly = pd.DataFrame(poly.fit_transform(X_poly),
                       columns=poly.get_feature_names_out(X_poly.columns))
df_poly.info()
Ramka wejściowa <class 'pandas.core.frame.DataFrame'> Int64Index: 5109 entries, 249 to 248 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 5109 non-null float64 1 hypertension 5109 non-null int64 2 heart_disease 5109 non-null int64 3 stroke 5109 non-null int64 4 bmi_categorical_encoded 5109 non-null float64 5 is_female 5109 non-null int32 6 is_married 5109 non-null int32 7 work_type_encoded 5109 non-null int32 8 is_rural 5109 non-null int32 9 log_glucose_no_outliers 5109 non-null float64 dtypes: float64(3), int32(4), int64(3) memory usage: 359.2 KB Ramka wyjściowa po PolyFeatures transformacji <class 'pandas.core.frame.DataFrame'> RangeIndex: 5109 entries, 0 to 5108 Data columns (total 66 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 1 5109 non-null float64 1 age 5109 non-null float64 2 hypertension 5109 non-null float64 3 heart_disease 5109 non-null float64 4 stroke 5109 non-null float64 5 bmi_categorical_encoded 5109 non-null float64 6 is_female 5109 non-null float64 7 is_married 5109 non-null float64 8 work_type_encoded 5109 non-null float64 9 is_rural 5109 non-null float64 10 log_glucose_no_outliers 5109 non-null float64 11 age^2 5109 non-null float64 12 age hypertension 5109 non-null float64 13 age heart_disease 5109 non-null float64 14 age stroke 5109 non-null float64 15 age bmi_categorical_encoded 5109 non-null float64 16 age is_female 5109 non-null float64 17 age is_married 5109 non-null float64 18 age work_type_encoded 5109 non-null float64 19 age is_rural 5109 non-null float64 20 age log_glucose_no_outliers 5109 non-null float64 21 hypertension^2 5109 non-null float64 22 hypertension heart_disease 5109 non-null float64 23 hypertension stroke 5109 non-null float64 24 hypertension bmi_categorical_encoded 5109 non-null float64 25 hypertension is_female 5109 non-null float64 26 hypertension is_married 5109 non-null float64 27 hypertension work_type_encoded 5109 non-null float64 
28 hypertension is_rural 5109 non-null float64 29 hypertension log_glucose_no_outliers 5109 non-null float64 30 heart_disease^2 5109 non-null float64 31 heart_disease stroke 5109 non-null float64 32 heart_disease bmi_categorical_encoded 5109 non-null float64 33 heart_disease is_female 5109 non-null float64 34 heart_disease is_married 5109 non-null float64 35 heart_disease work_type_encoded 5109 non-null float64 36 heart_disease is_rural 5109 non-null float64 37 heart_disease log_glucose_no_outliers 5109 non-null float64 38 stroke^2 5109 non-null float64 39 stroke bmi_categorical_encoded 5109 non-null float64 40 stroke is_female 5109 non-null float64 41 stroke is_married 5109 non-null float64 42 stroke work_type_encoded 5109 non-null float64 43 stroke is_rural 5109 non-null float64 44 stroke log_glucose_no_outliers 5109 non-null float64 45 bmi_categorical_encoded^2 5109 non-null float64 46 bmi_categorical_encoded is_female 5109 non-null float64 47 bmi_categorical_encoded is_married 5109 non-null float64 48 bmi_categorical_encoded work_type_encoded 5109 non-null float64 49 bmi_categorical_encoded is_rural 5109 non-null float64 50 bmi_categorical_encoded log_glucose_no_outliers 5109 non-null float64 51 is_female^2 5109 non-null float64 52 is_female is_married 5109 non-null float64 53 is_female work_type_encoded 5109 non-null float64 54 is_female is_rural 5109 non-null float64 55 is_female log_glucose_no_outliers 5109 non-null float64 56 is_married^2 5109 non-null float64 57 is_married work_type_encoded 5109 non-null float64 58 is_married is_rural 5109 non-null float64 59 is_married log_glucose_no_outliers 5109 non-null float64 60 work_type_encoded^2 5109 non-null float64 61 work_type_encoded is_rural 5109 non-null float64 62 work_type_encoded log_glucose_no_outliers 5109 non-null float64 63 is_rural^2 5109 non-null float64 64 is_rural log_glucose_no_outliers 5109 non-null float64 65 log_glucose_no_outliers^2 5109 non-null float64 dtypes: float64(66) memory usage: 2.6 
MB
As we can see, we have obtained all possible combinations of 2nd degree polynomial variables. However, we will begin by excluding variables that we will certainly not use, such as the bias term '1' and the squared binary variables, as well as variables with a 'stroke' interaction, in order to avoid so-called data leakage.
# Drop the bias column, squared binary flags and every 'stroke' interaction
# (keeping only the target itself, the first regex match) to avoid data
# leakage into the features.
# NOTE(review): [1:] assumes plain 'stroke' is the first column matching the
# regex -- true for PolynomialFeatures' ordering, but fragile.
col_drop = list(df_poly.filter(regex='stroke'))[1:] + ['1', 'hypertension^2', 'heart_disease^2', 'is_female^2', 'is_married^2', 'is_rural^2']
df_poly = df_poly[df_poly.columns.drop(col_drop)]
cat_features = ['is_rural', 'is_married', 'is_female', 'hypertension', 'heart_disease', 'work_type_encoded']
# Bug fix: the original deduplicated with set(), which yields a
# nondeterministic column order between runs (string hash randomisation);
# dict.fromkeys dedupes while preserving insertion order.
col_filter = list(dict.fromkeys(
    list(df_poly.filter(regex='age'))
    + list(df_poly.filter(regex='log_glucose_no_outliers'))
    + cat_features
    + ['bmi_categorical_encoded', 'stroke']))
df_poly = df_poly[col_filter]
df_poly.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5109 entries, 0 to 5108 Data columns (total 27 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age heart_disease 5109 non-null float64 1 is_married log_glucose_no_outliers 5109 non-null float64 2 age is_married 5109 non-null float64 3 age hypertension 5109 non-null float64 4 age log_glucose_no_outliers 5109 non-null float64 5 age bmi_categorical_encoded 5109 non-null float64 6 is_married 5109 non-null float64 7 age is_rural 5109 non-null float64 8 bmi_categorical_encoded 5109 non-null float64 9 log_glucose_no_outliers^2 5109 non-null float64 10 is_female log_glucose_no_outliers 5109 non-null float64 11 stroke 5109 non-null float64 12 heart_disease log_glucose_no_outliers 5109 non-null float64 13 heart_disease 5109 non-null float64 14 is_rural log_glucose_no_outliers 5109 non-null float64 15 age^2 5109 non-null float64 16 hypertension log_glucose_no_outliers 5109 non-null float64 17 work_type_encoded 5109 non-null float64 18 work_type_encoded log_glucose_no_outliers 5109 non-null float64 19 is_female 5109 non-null float64 20 age is_female 5109 non-null float64 21 age work_type_encoded 5109 non-null float64 22 bmi_categorical_encoded log_glucose_no_outliers 5109 non-null float64 23 log_glucose_no_outliers 5109 non-null float64 24 hypertension 5109 non-null float64 25 is_rural 5109 non-null float64 26 age 5109 non-null float64 dtypes: float64(27) memory usage: 1.1 MB
It is important to balance the data only on the training set, because in general the test set is a set that is supposed to reflect the real data - and the balanced ones are not.
# Predictor matrix X and target vector Y.
X = df_poly.drop(columns=['stroke'])
Y = df_poly.stroke
# Hold out the test set first: balancing must only ever touch training data.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=123)
# Positional 0..n-1 column used purely as the x-axis of the scatter plots.
# NOTE(review): this 'index' column remains in X_train and is handed to the
# balancing step below -- confirm downstream code drops it before fitting.
X_train['index'] = np.arange(len(X_train))
df_imbalanced = X_train.copy()
df_imbalanced['stroke'] = y_train
# The stroke class is almost invisible -- this is the imbalance problem.
plotting(data=df_imbalanced, x='index', y='age', type='scatter')
0
SMOTE uses the KNN algorithmic approach i.e. for each observation $x$ selects $k$ nearest neighbors and randomly selects one $x'$ of them. Then the difference between the $x$ and $x'$ coordinates is calculated and multiplied randomly by numbers in the interval $(0,1)$. The $x''$ observation thus created from the differences is added to the set. Geometrically, this maps the creation of a new observation based on the displacement of the current observation to one of its neighbors.
# Plain SMOTE oversampling: 7 neighbours, fully balanced classes (strategy=1).
df_smote, X_smote, y_smote = balance_data(
    X=X_train, y=y_train, cat_features=cat_features,
    algorithm='SMOTE', strategy=1, k_neighbors=7,
)
plotting(data=df_smote, x='index', y='age', type='scatter')
Liczba wszystkich obserwacji: 6530 Liczba obserwacji z klasy mniejszościowej: 3265 Procent klasy mniejszościowej do całości po zbalansowaniu: 0.5
0
SMOTE + Tomek is a hybrid technique that combines the oversampling technique of the SMOTE algorithm with the undersampling technique of the Tomek algorithm. After using SMOTE, there is a possibility of overlapping class clusters, which can result in overfitting. In that case, the Tomek algorithm is additionally used, which involves pairing samples from different classes together, and then observations from the majority class are removed in order to (sometimes from both) separate the classes more clearly.
# Hybrid SMOTE + Tomek links: oversample, then remove overlapping pairs.
df_smtom, X_smtom, y_smtom = balance_data(
    X=X_train, y=y_train, cat_features=cat_features,
    algorithm='SMOTETOMEK', strategy=1, k_neighbors=7,
)
plotting(data=df_smtom, x='index', y='age', type='scatter')
Liczba wszystkich obserwacji: 6432 Liczba obserwacji z klasy mniejszościowej: 3216 Procent klasy mniejszościowej do całości po zbalansowaniu: 0.5
0
SMOTE + ENN is a hybrid technique that involves the additional removal of a number of observations from the feature space. Once a collection is oversampled by SMOTE then ENN, which is an undersampling technique, is applied next. ENN (Edited Nearest Neighbor) involves finding the K nearest neighbors of each observation. It then checks if the majority class of these neighbors matches the class of the observation, if not then both the observation and the neighbors are removed. With this approach, the separation of classes from each other is clearer than in other cases.
# Hybrid SMOTE + Edited Nearest Neighbours: oversample, then prune
# inconsistent neighbourhoods for cleaner class separation.
df_smenn, X_smenn, y_smenn = balance_data(
    X=X_train, y=y_train, cat_features=cat_features,
    algorithm='SMOTEENN', strategy=1,
)
plotting(data=df_smenn, x='index', y='age', type='scatter')
Liczba wszystkich obserwacji: 6320 Liczba obserwacji z klasy mniejszościowej: 3055 Procent klasy mniejszościowej do całości po zbalansowaniu: 0.4833860759493671
0
We can see that each algorithm, balancing the data in its own specific way, generates artificial observations for the minority class. In the SMOTE ENN algorithm, we can further see a decrease in the number of observations due to an explicit undersampling technique. We will perform model training on each set for each model to have a broader view of the results. Moreover, we will also perform LASSO regularization for each of them.
In general, for evaluating solutions to classification problems, a confusion matrix is used.
It juxtaposes the true class values with the modeled class values. In the binary case, this can be represented as TP, or cases classified correctly into the true class, TN, or cases classified correctly into the false class, FP, or cases classified incorrectly into the true class, and FN, or cases classified incorrectly into the false class. FP values are so-called Type I error, that is, classifying something as true when in fact it is not. FN values, on the other hand, are type II error, that is, classifying something as false when in fact it is true.
Of this combination, the most popular metrics are:
In our case, we want to focus on minimizing Type II error at the expense of Type I error. This is because if a Type I error is made, i.e., a false prediction of stroke, unnecessary tests may be performed and financial resources incurred, while a Type II error carries serious health consequences and, in the worst case, death.
However, it is not enough to minimize FN, because in such a case every subsequent patient would be classified as a stroke patient. Some balance is needed between FP and FN therefore we will be guided by the metrics Precision - that is, how many people classified with a stroke actually have a stroke (a high precision value implies a low FP value) and Recall (a high recall value implies a low FN value), as well as their harmonic mean $F_1$.
In summary, for the class of stroke patients, the more relevant metric is Recall, which tells us what is the proportion of people classified with stroke to all people who should be classified. We expect a high value, as this will mean that we are correctly searching for people with stroke.
For non-stroke patients, we will look more at the Precision metric, which in turn tells us how many classified as non-stroke actually do not have a stroke. A high value will mean that we are accurately searching for people without stroke.
There are many variables in our data set after the introduction of polynomial features. While we did not remove too many of them during data preprocessing and still the number of variables is much smaller than the number of observations, still an excessive number of them can lead to overfitting phenomena, affect explainability, or introduce unnecessary noise into the model.
Therefore, we want to reduce this number by selecting the most relevant variables (although other approaches, such as dimensionality reduction through PCA, are also possible).
We will train 3 models:
logistic regression
the SVM or support vector machine.
randomForest
Each model will be trained after prior feature selection using logistic regression with LASSO regularization. This regularization zeros out certain coefficients and it will be trained using several regularization parameters among which the best one for the ROC AUC metric will be selected.
After selecting these variables, we will train all models on this selected set, along with searching the grid of hyperparameters of each model maximizing the ROC AUC score which will help us select the final model.
In the last step, we will proceed to the selection of threshold $p$, which is the probability value from which we will classify stroke patients. For logistic regression and RFC, this is natural, as these methods are inherently probabilistic and their algorithms return values for classification probabilities. For SVM, we can set the probability=True hyperparameter value, so we are able to retrieve these probabilities.
The whole thing is done behind the scenes using logistic regression on the SVM results data (Platt's algorithm). This technique is not entirely reliable and has caveats, namely:
In the context of the caveats setting our own threshold, we will somehow control this thereby reducing the effectiveness of the SVM algorithm itself, and focusing on the effectiveness of the Platt technique in estimating these probabilities.
# Fit the same baseline L1 logistic regression on each balanced dataset and
# accumulate the classification-report rows for comparison.
report = pd.DataFrame()
lr_base = LogisticRegressionWithThreshold(penalty='l1', solver='saga', max_iter=10000, random_state=123)
for X, y, name in zip([X_smote, X_smtom, X_smenn],
                      [y_smote, y_smtom, y_smenn],
                      ['SMOTE', 'SMOTE+TOMEK', 'SMOTE+ENN']):
    print('\n' + name + '\n')
    # Bug fix: set() deduplication gave a nondeterministic column order between
    # runs; dict.fromkeys dedupes while preserving insertion order.
    columns_to_standardize = list(dict.fromkeys(
        list(X.filter(regex='age')) + list(X.filter(regex='log_glucose_no_outliers'))))
    results = fit_pred_score(X, y, X_test, y_test, lr_base, dataset_name=name,
                             model_name='Base LR', scaler=True, ohe=True,
                             columns_to_standardize=columns_to_standardize,
                             cat_features=cat_features,
                             visualize_test=True, visualize_train=True)
    report = pd.concat([report, results])
report.reset_index(inplace=True, drop=True)
report.sort_values(by=['model_name', 'metric', 'dataset_name'])
SMOTE
SMOTE+TOMEK
SMOTE+ENN
| No stroke | Stroke | accuracy | macro avg | weighted avg | metric | dataset_name | model_name | split | |
|---|---|---|---|---|---|---|---|---|---|
| 2 | 0.840624 | 0.840233 | 0.840429 | 0.840429 | 0.840429 | f1-score | SMOTE | Base LR | train |
| 6 | 0.908547 | 0.246575 | 0.836892 | 0.577561 | 0.872818 | f1-score | SMOTE | Base LR | test |
| 18 | 0.821209 | 0.820244 | 0.820728 | 0.820727 | 0.820743 | f1-score | SMOTE+ENN | Base LR | train |
| 22 | 0.886301 | 0.265487 | 0.803084 | 0.575894 | 0.852794 | f1-score | SMOTE+ENN | Base LR | test |
| 10 | 0.842220 | 0.842171 | 0.842195 | 0.842195 | 0.842195 | f1-score | SMOTE+TOMEK | Base LR | train |
| 14 | 0.908364 | 0.258760 | 0.836892 | 0.583562 | 0.873302 | f1-score | SMOTE+TOMEK | Base LR | test |
| 0 | 0.839597 | 0.841265 | 0.840429 | 0.840431 | 0.840431 | precision | SMOTE | Base LR | train |
| 4 | 0.967422 | 0.164234 | 0.836892 | 0.565828 | 0.924071 | precision | SMOTE | Base LR | test |
| 16 | 0.847005 | 0.795874 | 0.820728 | 0.821440 | 0.822289 | precision | SMOTE+ENN | Base LR | train |
| 20 | 0.976604 | 0.166205 | 0.803084 | 0.571404 | 0.932863 | precision | SMOTE+ENN | Base LR | test |
| 8 | 0.842089 | 0.842302 | 0.842195 | 0.842195 | 0.842195 | precision | SMOTE+TOMEK | Base LR | train |
| 12 | 0.969417 | 0.171429 | 0.836892 | 0.570423 | 0.926346 | precision | SMOTE+TOMEK | Base LR | test |
| 1 | 0.841654 | 0.839204 | 0.840429 | 0.840429 | 0.840429 | recall | SMOTE | Base LR | train |
| 5 | 0.856426 | 0.494505 | 0.836892 | 0.675466 | 0.836892 | recall | SMOTE | Base LR | test |
| 17 | 0.796937 | 0.846154 | 0.820728 | 0.821546 | 0.820728 | recall | SMOTE+ENN | Base LR | train |
| 21 | 0.811285 | 0.659341 | 0.803084 | 0.735313 | 0.803084 | recall | SMOTE+ENN | Base LR | test |
| 9 | 0.842351 | 0.842040 | 0.842195 | 0.842195 | 0.842195 | recall | SMOTE+TOMEK | Base LR | train |
| 13 | 0.854545 | 0.527473 | 0.836892 | 0.691009 | 0.836892 | recall | SMOTE+TOMEK | Base LR | test |
| 3 | 3265.000000 | 3265.000000 | 0.840429 | 6530.000000 | 6530.000000 | support | SMOTE | Base LR | train |
| 7 | 1595.000000 | 91.000000 | 0.836892 | 1686.000000 | 1686.000000 | support | SMOTE | Base LR | test |
| 19 | 3265.000000 | 3055.000000 | 0.820728 | 6320.000000 | 6320.000000 | support | SMOTE+ENN | Base LR | train |
| 23 | 1595.000000 | 91.000000 | 0.803084 | 1686.000000 | 1686.000000 | support | SMOTE+ENN | Base LR | test |
| 11 | 3216.000000 | 3216.000000 | 0.842195 | 6432.000000 | 6432.000000 | support | SMOTE+TOMEK | Base LR | train |
| 15 | 1595.000000 | 91.000000 | 0.836892 | 1686.000000 | 1686.000000 | support | SMOTE+TOMEK | Base LR | test |
# Test-split metric values of the baseline LR for each balancing technique.
test_mask = (report.model_name == 'Base LR') & (report.split == 'test')
plot_datasets_scores = transform_report_to_plot(report[test_mask], x='dataset_name', cl='metric')
plotting(data=plot_datasets_scores, x='', y='value', cl='metric', type='lineplot')
print("A graph of the dependence of the values of selected metrics on the balancing technique")
A graph of the dependence of the values of selected metrics on the balancing technique
# Train-vs-test metric gaps per balancing technique (overfitting check).
report_diff = prepare_diff_dataset(report, by='dataset_name')
diff_mask = (report_diff.model_name == 'Base LR') & report_diff.train_test_diffaccuracy.notna()
plot_diff_scores = transform_report_to_plot(report_diff[diff_mask], x='dataset_name',
                                            cl='metric', train_test=True)
plotting(data=plot_diff_scores, x='', y='value', cl='metric', type='lineplot')
print("Plot of the dependence of differences between training and test set values of selected metrics on balancing technique")
Plot of the dependence of differences between training and test set values of selected metrics on balancing technique
For a general look, we examine the 'macro-avg' metric, as it does not take into account the proportion of classes (in terms of number of occurrences), and we want the 'Stroke' class to be equally important in evaluating the model.
Thus for:
In general, it can be seen that the model performs quite well without performing preprocessing and selecting hyperparameters. In our case, the choice of data balancing method is not likely to be very important, but 3 possibilities were selected in order to show different techniques as well as some impact on possible results. Here the differences are really small, but due to the higher values, as well as lower differences relative to the training and test set, we will choose the SMOTE+ENN hybrid method.
# Continue with the SMOTE+ENN-balanced training data chosen above.
X = X_smenn.copy()
y = y_smenn.copy()
dataset_name = 'SMOTE+ENN'
# Bug fix: set() deduplication produced a nondeterministic column order across
# runs; dict.fromkeys dedupes while keeping insertion order.
# bmi is categorised but encoded via the median so the value retains some
# information, hence it is standardized as well.
columns_to_standardize = list(dict.fromkeys(
    list(X.filter(regex='age')) + list(X.filter(regex='log_glucose_no_outliers'))
)) + ['bmi_categorical_encoded']
X_train_preprocessed, X_test_preprocessed, standard_scaler = preprocess_data(
    X, X_test, scaler=True, ohe=True,
    columns_to_standardize=columns_to_standardize, cat_features=cat_features)
To select variables we will use the LASSO method in this case it will be logistic regression with a penalty function in the L1 metric. We will search the grid of C adjusting parameters (the lower the value, the more restrictive the penalty function). We will maximize the ROC AUC score, that is, maximize the number of true positive predictions while minimizing negative predictions. We will then select the appropriate variables from the best estimator using the SelectFromModel module.
# Regularisation grid for LASSO logistic regression: lower C means a stronger
# L1 penalty and therefore a sparser coefficient vector.
lr_grid = {'C': [1, 0.01, 0.001, 0.0001, 0.00001]}
lr = LogisticRegressionWithThreshold(penalty='l1', solver='saga', max_iter=10000, random_state=123)
# Bug fix: the analysis text states the grid search maximises the ROC AUC
# score, but the original passed scoring='f1'; align the code with the stated
# selection criterion.
lr_search = GridSearchCV(estimator=lr, param_grid=lr_grid, scoring='roc_auc',
                         cv=3, n_jobs=-1)
lr_search.fit(X_train_preprocessed, y)
GridSearchCV(cv=3,
estimator=LogisticRegressionWithThreshold(max_iter=10000,
penalty='l1',
random_state=123,
solver='saga'),
n_jobs=-1, param_grid={'C': [1, 0.01, 0.001, 0.0001, 1e-05]},
scoring='f1')
# Keep the best L1-penalized estimator found by the grid search and use it
# to drop features whose coefficients LASSO shrank to zero.
lr = lr_search.best_estimator_
selection_model = SelectFromModel(lr).fit(X_train_preprocessed, y)
# Map each feature name to its fitted coefficient for inspection.
fitted_lasso = selection_model.estimator
coefs = dict(zip(fitted_lasso.feature_names_in_, fitted_lasso.coef_[0]))
# Reduce the training matrix to the selected features and compare dimensions.
X_selected = selection_model.transform(X_train_preprocessed)
print("The dimension of the initial dataset: ", X_train_preprocessed.shape)
print("The dimension of the set after the selection of variables: ", X_selected.shape)
coefs
The dimension of the initial dataset: (6320, 26) The dimension of the set after the selection of variables: (6320, 26)
{'age heart_disease': -0.392978351567809,
'is_married log_glucose_no_outliers': -1.3696022295920751,
'age is_married': -0.20652025298948656,
'age hypertension': -0.5360502118541424,
'age log_glucose_no_outliers': 3.5426850504075413,
'age bmi_categorical_encoded': -1.8946165937878954,
'is_married': 4.113051514840918,
'age is_rural': -0.24886293590718095,
'bmi_categorical_encoded': -0.2721730623484636,
'log_glucose_no_outliers^2': 2.3209387643836163,
'is_female log_glucose_no_outliers': 0.5107077922181571,
'heart_disease log_glucose_no_outliers': 0.8344674630856949,
'heart_disease': -1.578378610652275,
'is_rural log_glucose_no_outliers': 0.3124455072056034,
'age^2': -1.762631227290451,
'hypertension log_glucose_no_outliers': 1.0446600053311177,
'work_type_encoded': 0.06631646886871642,
'work_type_encoded log_glucose_no_outliers': 0.2932920260518792,
'is_female': -0.548647375395707,
'age is_female': -0.2860917887792188,
'age work_type_encoded': -0.32965127277763884,
'bmi_categorical_encoded log_glucose_no_outliers': 1.2135344685285758,
'log_glucose_no_outliers': -3.2697571834287418,
'hypertension': -1.8745757969323777,
'is_rural': -0.3836178773413026,
'age': 3.299478869264678}
We see that no variable has been "removed" by LASSO by assigning zero weight.
The final step will be to tune the hyperparameters for each model. In logistic regression, we will choose values for the parameter $C$ responsible for regularization. For SVM, we will try different values of gamma, $C$ and a linear kernel. For RFC, we will create a hyperparameter space that takes into account, among other things, the depth of the tree. We will use GridSearchCV from the sklearn package for this. The metric we will optimize will be the F1 score, which balances precision and recall and is well suited to our imbalanced classes. This metric allows for a good comparison of models and, like the previous ones chosen, correctly reflects our goal.
# Hyper-parameter grids for the two remaining model families.
svm_grid = {'C': [1, 0.01, 0.0001],
            'gamma': [1, 0.01, 0.0001],
            'kernel': ['rbf', 'linear']}
rfc_grid = {
    'bootstrap': [True],
    'max_depth': [80, 100],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 5],
    'min_samples_split': [8,12],
    'n_estimators': [100, 500, 1000]
}
# Create the base models (project wrappers that expose a threshold parameter).
svm = SupportVectorMachineWithThreshold()
rfc = RandomForestClassifierWithThreshold(random_state = 123)
# Instantiate the grid search model.
# NOTE: both searches optimize F1 (scoring='f1'), not ROC AUC.
svm_search = GridSearchCV(estimator = svm, param_grid = svm_grid, scoring = 'f1',
                          cv = 3, n_jobs = -1, verbose = 2)
rfc_search = GridSearchCV(estimator = rfc, param_grid = rfc_grid, scoring = 'f1',
                          cv = 3, n_jobs = -1, verbose = 2)
# Fit each search on the selected features and report its best estimator.
for grid in [svm_search, rfc_search]:
    grid.fit(X_selected,y)
    print(grid.best_estimator_)
Fitting 3 folds for each of 18 candidates, totalling 54 fits
SupportVectorMachineWithThreshold(C=1, gamma=1)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
RandomForestClassifierWithThreshold(max_depth=80, max_features=3,
min_samples_leaf=3, min_samples_split=8,
n_estimators=500, random_state=123)
So we got 3 models with the best parameters in terms of the optimized F1 metric from the searched parameter grids. Now we will put them together.
# Start a fresh figure for the ROC comparison plot below.
plt.figure(0).clf()
# Re-create SVM and RFC with the best parameters found by the grid searches
# (probability=True is needed by the SVM wrapper for thresholded prediction).
svm = SupportVectorMachineWithThreshold(gamma=1, C=1, kernel='rbf', probability=True, random_state = 123)
rfc = RandomForestClassifierWithThreshold(max_depth=80, max_features=3,
                                          min_samples_leaf=3, min_samples_split=8,
                                          n_estimators=500, random_state=123)
# Fit all three tuned models on the LASSO-selected training features.
lr.fit(X_selected, y)
svm.fit(X_selected, y)
rfc.fit(X_selected, y)
RandomForestClassifierWithThreshold(max_depth=80, max_features=3,
min_samples_leaf=3, min_samples_split=8,
n_estimators=500, random_state=123)
<Figure size 800x550 with 0 Axes>
def _plot_roc_curve(y_true, y_pred, label):
    """Add one ROC curve (with its AUC in the legend entry) to the current figure."""
    fpr, tpr, _ = metrics.roc_curve(y_true, y_pred)
    auc = round(metrics.roc_auc_score(y_true, y_pred), 4)
    plt.plot(fpr, tpr, label=label + ", AUC=" + str(auc))

# Test-set predictions for each tuned model; y_pred_lr is reused further below.
y_pred_lr = lr.predict(X_test_preprocessed)
y_pred_svm = svm.predict(X_test_preprocessed)
y_pred_rfc = rfc.predict(X_test_preprocessed)
# Draw the three ROC curves on one figure for direct comparison.
_plot_roc_curve(y_test, y_pred_lr, "Logistic Regression")
_plot_roc_curve(y_test, y_pred_svm, "Support Vector Machine")
_plot_roc_curve(y_test, y_pred_rfc, "Random Forest Classifier")
#add legend
plt.legend()
plt.show()
We see that among the best estimators for the selected parameter space, logistic regression performs best. In contrast, SVM and RFC perform only slightly better than the baseline estimator, i.e. predicting with a probability of 50%. It's like guessing whether someone has a stroke by flipping a coin - SVM and RFC perform only slightly better than this scenario. So we'll limit ourselves to logistic regression and see if the choice of threshold $p$ affects our result.
# Evaluate the tuned logistic regression at a range of decision thresholds p,
# appending train/test classification metrics for each to the shared report.
for p in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
    print('\nValue of p='+str(p)+' for Logistic Regression model\n')
    model_name = 'Logistic Regression with p='+str(p)
    # fit_pred_score refits/evaluates and returns a metrics dataframe row set.
    results = fit_pred_score(X_selected, y, X_test_preprocessed, y_test, lr, dataset_name = dataset_name, model_name = model_name, scaler = False, ohe=False,
                             visualize_test = True, visualize_train = True, threshold = p)
    report = pd.concat([report, results])
# Tidy the accumulated report and display it sorted for readability.
report.reset_index(inplace=True, drop=True)
report.sort_values(by=['model_name','metric','dataset_name'])
Value of p=0.1 for Logistic Regression model
Value of p=0.2 for Logistic Regression model
Value of p=0.3 for Logistic Regression model
Value of p=0.4 for Logistic Regression model
Value of p=0.5 for Logistic Regression model
Value of p=0.6 for Logistic Regression model
Value of p=0.7 for Logistic Regression model
Value of p=0.8 for Logistic Regression model
Value of p=0.9 for Logistic Regression model
| No stroke | Stroke | accuracy | macro avg | weighted avg | metric | dataset_name | model_name | split | |
|---|---|---|---|---|---|---|---|---|---|
| 2 | 0.840624 | 0.840233 | 0.840429 | 0.840429 | 0.840429 | f1-score | SMOTE | Base LR | train |
| 6 | 0.908547 | 0.246575 | 0.836892 | 0.577561 | 0.872818 | f1-score | SMOTE | Base LR | test |
| 18 | 0.821209 | 0.820244 | 0.820728 | 0.820727 | 0.820743 | f1-score | SMOTE+ENN | Base LR | train |
| 22 | 0.886301 | 0.265487 | 0.803084 | 0.575894 | 0.852794 | f1-score | SMOTE+ENN | Base LR | test |
| 10 | 0.842220 | 0.842171 | 0.842195 | 0.842195 | 0.842195 | f1-score | SMOTE+TOMEK | Base LR | train |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 92 | 0.948214 | 0.666667 | 0.947212 | 0.807440 | 0.933018 | precision | SMOTE+ENN | Logistic Regression with p=0.9 | test |
| 89 | 0.996937 | 0.284124 | 0.652373 | 0.640531 | 0.652373 | recall | SMOTE+ENN | Logistic Regression with p=0.9 | train |
| 93 | 0.998746 | 0.043956 | 0.947212 | 0.521351 | 0.947212 | recall | SMOTE+ENN | Logistic Regression with p=0.9 | test |
| 91 | 3265.000000 | 3055.000000 | 0.652373 | 6320.000000 | 6320.000000 | support | SMOTE+ENN | Logistic Regression with p=0.9 | train |
| 95 | 1595.000000 | 91.000000 | 0.947212 | 1686.000000 | 1686.000000 | support | SMOTE+ENN | Logistic Regression with p=0.9 | test |
96 rows × 9 columns
# Reshape test-set scores of the thresholded LR models for line-plotting.
plot_datasets_scores = transform_report_to_plot(report[(report.model_name.str.contains('Logistic Regression with p')) & (report.split=='test')], x='model_name')
plotting(data = plot_datasets_scores, x='', y='value', cl='metric', type='lineplot')
0
# Compute train-vs-test metric differences per model and plot them against p
# to gauge over/under-fitting at each decision threshold.
report_diff_p = prepare_diff_dataset(report, by='model_name')
plot_diff_scores_p = transform_report_to_plot(report_diff_p[(report_diff_p.model_name.str.contains('Logistic Regression with p')) & (report_diff_p.train_test_diffaccuracy.isna()==False)], x='model_name', cl='metric', train_test=True)
plotting(data = plot_diff_scores_p, x='', y='value', cl='metric', type='lineplot')
print("Plot of the dependence of differences between training and test set values of selected metrics on p-values")
#plot_diff_scores_p
Plot of the dependence of differences between training and test set values of selected metrics on p-values
Applications for:
Precision values:
Recall values:
As we mentioned at the beginning, we want to maximize the Precision value for the non-impact class (0) and Recall for the impact class (1). In addition, high values of Precision for class (1) and Recall for class (0) will be an asset.
From a value of $p=0.2$ for LR, we can see a strong drop in Recall for the class with impact. At the point of decrease for class (0), the precision value is relatively high, and for class (1) there is a slight decrease relative to Recall. In addition, low $p$ values have low differences in metrics between the training and test set. They are also in line with the "domain sense," i.e., we want to be more attentive to the possibility of a stroke, and rather, a small probability of occurrence is already enough to classify a patient. Admittedly, the risk of false classification increases, but in the worst case, unnecessary costs will be incurred (for such a reason, the value cannot be too low either, since we will almost immediately classify everyone as having a stroke). On the other hand, too high a value for the required probability can put people's health and even lives at risk (this can be seen, for example, in the drastic drop in Recall for Class 1, which means that the number of people classified as having no stroke, where in fact there was one, has strongly increased).
We are interested in the values of $p \in (0.2, 0.3)$, because they have the high values of the Precision for (0) score and Recall for (1) score metrics that we established at the beginning. At the same time, these are the points at which the value of Precision for (1) and Recall for (0) begins to increase with $F_1$ score.
A value of $p=0.1$ would classify almost everyone as a stroke patient, which is an undesirable effect and no model is needed for such conclusions. In contrast, at values of $p > 0.3$ we see a strong drop in Recall for class 1, so we lose confidence that all stroke patients who should have been classified were indeed assigned to that class.
This leaves a decision between the higher values of the apriori set metrics and some domain intuition, since for $p=0.2$ there is also a risk of classifying too many "healthy" patients. So let's check the AUC values for these two parameters, which will tell us which scenario maximizes TPR with minimal FPR.
# Start a fresh figure and compare ROC curves of the tuned logistic regression
# for several decision thresholds p.
plt.figure(0).clf()
# NOTE(review): the original code reused the default-threshold predictions for
# the p=0.5 curve; we recompute with an explicit threshold=0.5, assuming the
# estimator's default threshold is 0.5 as the original label stated.
for p in [0.1, 0.2, 0.3, 0.4, 0.5]:
    y_pred_p = lr.predict(X_test_preprocessed, threshold=p)
    fpr_p, tpr_p, _ = metrics.roc_curve(y_test, y_pred_p)
    auc = round(metrics.roc_auc_score(y_test, y_pred_p), 4)
    plt.plot(fpr_p, tpr_p, label="Logistic Regression with p=" + str(p) + ", AUC=" + str(auc))
#add legend
plt.legend()
plt.show()
As we can see, the intuitions expressed in the previous commentary were confirmed.
We achieved $AUC=0.7652$ for $p=0.2$ which is better than the default value of the parameter $p=0.5$ for which $AUC=0.7314$, so there is some improvement in the model. We can also see the fact that the high values of the metrics chosen apriori rightly maximize the AUC value while apparently taking into account the risk of classifying most patients as stroke patients. Evidently, this is a good direction for an unbalanced set like ours.
For $p=0.3$ we have $AUC=0.7521$ which is also not a bad result and after consulting a specialist perhaps would have been chosen as it assigns a class with a higher probability level.
You can also see how with an increase in the parameter $p$ the value of $AUC$ decreases.
Finally, we choose $p=0.2$ as the value of our main model. As a final step, we will prepare a whole ready pipeline for stroke prediction.
# Save the final logistic regression model to disk with joblib so it can be
# reloaded later without retraining (see the load/predict cells below).
filename = 'final_lr_no_smoking_status_model.sav'
joblib.dump(lr, filename)
['final_lr_no_smoking_status_model.sav']
def preprocess_final(X_new: pd.DataFrame, standard_scaler):
    """
    Preprocess raw patient data into the feature matrix expected by the final model.

    Repeats the training-time pipeline: BMI imputation/categorization, binary
    encoding of gender/marriage/residence, label encoding of work type,
    IQR-based outlier handling, log-transform of glucose, degree-2 polynomial
    features, and finally standardization + one-hot encoding via
    ``preprocess_data`` with the scaler fitted on the training set.

    It performs even unwanted operations because of the lack of time to
    prepare cleaning of the code.

    Args:
        X_new: raw dataframe with the original dataset's columns
            ('gender', 'age', 'ever_married', 'work_type', 'Residence_type',
            'avg_glucose_level', ...).
        standard_scaler: the scaler fitted on the training data, passed through
            to ``preprocess_data``.

    Returns:
        Preprocessed feature matrix ready for the final model's ``predict``.

    Note:
        Rows with gender == 'Other' are dropped, so the output can contain
        fewer rows than the input.
    """
    # Work on a copy so the caller's dataframe is not mutated in place
    # (the original assigned df_clear = X_new and modified the argument).
    df_clear = X_new.copy()
    ### BMI
    # BMI is imputed with a constant (presumably the training-set median --
    # TODO confirm) and then encoded with the median of its category to keep
    # some ordinal information in the value.
    df_clear['bmi_no_nan'] = 28.1
    df_clear['bmi_no_outliers'] = df_clear.bmi_no_nan
    df_clear = iqr_outliers(df_clear, 'bmi_no_outliers')
    df_clear.loc[(df_clear.bmi_no_outliers < 18.5), 'bmi_categorical_encoded'] = 17.0
    df_clear.loc[(df_clear.bmi_no_outliers >= 18.5) & (df_clear.bmi_no_outliers < 24.9), 'bmi_categorical_encoded'] = 22.0
    df_clear.loc[(df_clear.bmi_no_outliers >= 24.9) & (df_clear.bmi_no_outliers < 29.9), 'bmi_categorical_encoded'] = 27.0
    df_clear.loc[(df_clear.bmi_no_outliers >= 29.9) & (df_clear.bmi_no_outliers < 34.9), 'bmi_categorical_encoded'] = 32.0
    df_clear.loc[(df_clear.bmi_no_outliers >= 34.9), 'bmi_categorical_encoded'] = 38.0
    ### GENDER
    df_clear = df_clear.loc[df_clear.gender != 'Other',:]
    df_clear.loc[df_clear.gender == 'Female', 'is_female'] = 1
    df_clear.loc[df_clear.gender == 'Male', 'is_female'] = 0
    df_clear.is_female = df_clear.is_female.astype('int')
    ### AGE
    df_clear = iqr_outliers(df=df_clear, col='age')
    ### EVER_MARRIED
    df_clear.loc[df_clear.ever_married == 'Yes', 'is_married'] = 1
    df_clear.loc[df_clear.ever_married == 'No', 'is_married'] = 0
    df_clear.is_married = df_clear.is_married.astype('int')
    ### WORK_TYPE
    # NOTE(review): the encoder is refit on the incoming data, so the codes may
    # differ from training when some work_type categories are absent --
    # TODO: persist and reuse the training-time encoder.
    le = preprocessing.LabelEncoder()
    le.fit(df_clear.work_type)
    df_clear['work_type_encoded'] = le.transform(df_clear.work_type)
    ### RESIDENCE_TYPE
    df_clear.loc[df_clear.Residence_type == 'Rural', 'is_rural'] = 1
    df_clear.loc[df_clear.Residence_type == 'Urban', 'is_rural'] = 0
    df_clear.is_rural = df_clear.is_rural.astype('int')
    ### AVG_GLUCOSE_LEVEL
    df_clear['log_glucose_no_outliers'] = np.log(df_clear.avg_glucose_level)
    df_clear = iqr_outliers(df_clear, 'log_glucose_no_outliers')
    ### POLYNOMIAL FEATURES
    poly = preprocessing.PolynomialFeatures(2)
    # Select only the a-priori chosen numeric variables.
    X_poly = df_clear.select_dtypes(['int32','int64','float32','float64'])
    X_poly = X_poly.loc[:,~X_poly.columns.isin(['id','smoking_status', 'bmi_no_nan','bmi','avg_glucose_level','glucose_no_outliers', 'log_bmi_no_outliers', 'bmi_no_outliers','glucose_categorical_encoded'])]
    df_poly = pd.DataFrame(poly.fit_transform(X_poly), columns = poly.get_feature_names_out(X_poly.columns))
    # Keep exactly the interaction/polynomial terms the model was trained on.
    df_poly = df_poly[['age','age bmi_categorical_encoded','age heart_disease','age hypertension','age is_female',
                       'age is_married','age is_rural','age log_glucose_no_outliers','age work_type_encoded',
                       'age^2','bmi_categorical_encoded','bmi_categorical_encoded log_glucose_no_outliers','heart_disease','heart_disease log_glucose_no_outliers',
                       'hypertension','hypertension log_glucose_no_outliers','is_female','is_female log_glucose_no_outliers','is_married','is_married log_glucose_no_outliers',
                       'is_rural','is_rural log_glucose_no_outliers','log_glucose_no_outliers','log_glucose_no_outliers^2','work_type_encoded','work_type_encoded log_glucose_no_outliers']]
    cat_features = ['is_rural','is_married', 'is_female', 'hypertension', 'heart_disease', 'work_type_encoded']
    # BMI is categorical, but encoded with the category median to preserve some
    # information in the value, which is why it is standardized as well.
    columns_to_standardize = list(set(list(df_poly.filter(regex='age')) + list(df_poly.filter(regex='log_glucose_no_outliers')))) + ['bmi_categorical_encoded']
    ### STANDARDIZE + OHE
    # test_only=True: apply (do not refit) the training-time scaler/encoder.
    X_new_preprocessed = preprocess_data(None, df_poly, scaler = True, ohe=True, columns_to_standardize = columns_to_standardize, cat_features = cat_features, test_only=True, sc=standard_scaler)
    return X_new_preprocessed
#X_new
def predict_stroke(X: pd.DataFrame, lr_model, standard_scaler):
    """
    Predict stroke occurrence for raw patient data using the trained model.

    Args:
        X: raw patient dataframe with the original dataset's columns.
        lr_model: trained thresholded logistic regression model.
        standard_scaler: the scaler fitted on the training data.

    Returns:
        Array of 0/1 predictions (1 = stroke).
    """
    # NOTE: the annotation was fixed from `pd.DataFrame()` (an instance) to
    # the class `pd.DataFrame`.
    model = lr_model
    # Preprocess exactly as during training.
    X_preprocessed = preprocess_final(X, standard_scaler)
    # p=0.2 is the final decision threshold chosen in the analysis above
    # (the code previously used 0.3, contradicting the stated conclusion).
    predicted_strokes = model.predict(X_preprocessed, threshold=0.2)
    return predicted_strokes
# Load the previously trained model back from disk to verify the saved
# artifact round-trips correctly; displaying it echoes its parameters.
loaded_model = joblib.load(filename)
loaded_model
LogisticRegressionWithThreshold(C=1, max_iter=10000, penalty='l1',
random_state=123, solver='saga')
# Sanity check: predict strokes for the first six raw rows with the reloaded
# model and the scaler fitted during training.
predict_stroke(df.loc[0:5,:], loaded_model, standard_scaler)
array([0, 0, 0, 0, 0, 0])